---
output: html_document
editor_options: 
  chunk_output_type: inline
---

\clearpage

# Coding of Twitter accounts

\clearpage

```{r include = TRUE, results = 'asis'}

# Twitter lists used to code far-right accounts. All ten columns are read as
# character (presumably so that long numeric ids are kept verbatim -- TODO
# confirm).
farright_lists <-
  read.csv("data/fringe_lists.csv", colClasses = rep("character", 10))

# Disabled summary of the distinct list ids:
# farright_lists %>%
#   dplyr::distinct(list_id) %>%
#   summary() %>%
#   knitr::kable(format = 'latex')

# Drop the descriptive and identifier columns, then typeset the remainder as
# a striped long table whose header is repeated across pages.
farright_lists %>%
  dplyr::select(-c(name, slug, description, user_id, creator_id)) %>%
  kableExtra::kbl(
    caption = "Twitter lists used to code far-right accounts (n=210)",
    booktabs = TRUE,
    longtable = TRUE) %>%
  kableExtra::kable_styling(
    font_size = 8,
    latex_options = c("striped", "scale_down", "repeat_header"))

```

```{r include = TRUE, results = 'asis'}

# Twitter lists used to code journalist accounts; both columns are read as
# character.
journalist_lists <-
  read.csv("data/journalist.lists.csv", colClasses = rep("character", 2))

# Disabled summary of the distinct list ids:
# journalist_lists %>%
#   dplyr::distinct(list_id) %>%
#   summary() %>%
#   knitr::kable(format = 'latex')

# Typeset the full table as a striped long table with a repeated header.
kableExtra::kable_styling(
  kableExtra::kbl(
    journalist_lists,
    booktabs = TRUE,
    longtable = TRUE,
    caption = "Twitter lists used to code journalist accounts (n=5)"),
  latex_options = c("striped", "scale_down", "repeat_header"),
  font_size = 8)

```

```{r include = TRUE, results = 'asis'}

# Twitter lists used to code politician accounts; both columns are read as
# character.
politician_lists <-
  read.csv("data/politician_lists.csv", colClasses = rep("character", 2))

# Disabled summary of the distinct list ids:
# politician_lists %>%
#   dplyr::distinct(list_id) %>%
#   summary() %>%
#   knitr::kable(format = 'latex')

# Typeset the full table as a striped long table with a repeated header.
kableExtra::kable_styling(
  kableExtra::kbl(
    politician_lists,
    booktabs = TRUE,
    longtable = TRUE,
    caption = "Twitter lists used to code politician accounts (n=10)"),
  latex_options = c("striped", "scale_down", "repeat_header"),
  font_size = 8)

```

\FloatBarrier

```{r, include = TRUE, results = 'asis'}

# Restore the saved workspace objects; the table below reads the three
# *_user_ids vectors, which are expected to come from this file.
load("data/coded_user_ids_descriptions.RData")

# One-row table with the number of coded accounts in each category. The
# data-frame column names are placeholders: the printed headers are supplied
# through `col.names`.
data.frame(far_right = length(farright_user_ids),
           journalist = length(journalist_user_ids),
           politician = length(politician_user_ids)) %>%
  knitr::kable(
    format = "latex",
    caption = "Twitter accounts coded by category.",
    col.names = c("far-right accounts",
                  "journalist accounts",
                  "politician accounts"))

```

\clearpage

# Inverse document frequency (idf) of profile descriptions

Profile descriptions of far-right accounts were processed with the following function, which, after pre-processing the texts, computes the idf of each word in each document, averages the idf scores of each word across the collection and, finally, rescales the average idf score to the range 1:0 to produce the weights (the `score` column) shown in the following tables. 

```{r eval = F, echo = T}

# Terms removed from the lower-cased descriptions before tokenisation:
# English stop words plus URL fragments and one contraction.
ignored_terms <- c(stopwords(kind = "en"),
                   "http", "https", "t.co", "co",
                   "don’t", "bit.ly")

farright_user_descriptions_word <-
  farright_user_descriptions %>%
  # Lower-case each description and strip the ignored terms.
  dplyr::mutate(
    user_description = removeWords(tolower(user_description),
                                   words = ignored_terms)) %>%
  # One row per word occurrence; purely numeric tokens are discarded.
  tidytext::unnest_tokens(word, user_description, token = "words") %>%
  dplyr::filter(!grepl("^[0-9]+$", word)) %>%
  # Word counts per account, then tf-idf with each account as a document.
  dplyr::count(id, word) %>%
  dplyr::ungroup() %>%
  tidytext::bind_tf_idf(word, id, n) %>%
  # Average idf of each word over the collection.
  dplyr::group_by(word) %>%
  dplyr::summarise(idf = mean(idf)) %>%
  dplyr::ungroup() %>%
  # Reversed rescaling: the lowest average idf maps to 1, the highest to 0.
  dplyr::mutate(score = scales::rescale(idf, to = c(1, 0))) %>%
  dplyr::filter(!is.na(word))

```

```{r, include = T, results = 'asis'}

# Terms with the highest weight among far-right profile descriptions.
# `top_n()` keeps ties, so more than 60 rows can be returned.
# NOTE(review): the displayed pre-processing code names the rescaled column
# `score`; `weight` is assumed to be its name in the object used here --
# confirm.
this_table <-
  farright_user_descriptions_word %>%
  dplyr::top_n(60, wt = weight) %>%
  dplyr::arrange(dplyr::desc(weight))

# Split the rows into two side-by-side tables: the first 40 and whatever is
# left. The positions are derived from the actual row count rather than a
# fixed 41:71 range, so no all-NA rows are appended when fewer than 71 terms
# are selected and no tied terms are dropped when more are.
left_rows <- seq_len(min(40, nrow(this_table)))
right_rows <- setdiff(seq_len(nrow(this_table)), left_rows)

knitr::kable(
  list(this_table[left_rows, ], this_table[right_rows, ]),
  booktabs = TRUE, format = "latex",
  col.names = c("word", "idf", "score"),
  caption = "Most common terms found among the 208 profile descriptions of far-right accounts and relative score.") %>%
  kableExtra::kable_styling(latex_options = c("striped"), font_size = 10)

```

```{r, include = T, results = 'asis'}

# Terms with the highest weight among #Climateaction profile descriptions.
# `top_n()` keeps ties, so more than 60 rows can be returned.
this_table <-
  climateaction_user_descriptions_word %>%
  dplyr::top_n(60, wt = weight) %>%
  dplyr::arrange(dplyr::desc(weight))

# Split the rows into two side-by-side tables: the first 40 and whatever is
# left. The positions are derived from the actual row count rather than a
# fixed 41:71 range, so no all-NA rows are appended when fewer than 71 terms
# are selected and no tied terms are dropped when more are.
left_rows <- seq_len(min(40, nrow(this_table)))
right_rows <- setdiff(seq_len(nrow(this_table)), left_rows)

knitr::kable(
  list(this_table[left_rows, ], this_table[right_rows, ]),
  booktabs = TRUE, format = "latex",
  col.names = c("word", "idf", "score"),
  caption = "Most common terms found among the 200 profile descriptions of \\#Climateaction accounts and relative score.") %>%
  kableExtra::kable_styling(latex_options = c("striped"), font_size = 10)

```

```{r include = T, fig.width = 6, fig.height = 4, fig.cap = "Distribution of account's scores calculated from the Inverse document frequency of terms in the profile descriptions."}

# Box plot of the description weight for each value of `far_right`;
# `coef = 10` sets the whisker length multiplier passed to `geom_boxplot()`.
# Accounts with `far_right == TRUE` are overlaid as jittered blue points.
ggplot(all_user_description_weight, aes(x = far_right, y = weight)) +
  geom_boxplot(coef = 10) +
  geom_jitter(
    data = dplyr::filter(all_user_description_weight, far_right == TRUE),
    colour = "blue",
    alpha = 0.5)

```

\clearpage

# The Australian far-right on Twitter

\clearpage

```{r compare_farright_behaviour, include = T, fig.width = 6, fig.height = 4, fig.cap = "Distribution of number of tweets, retweets and replies by user.", cache = T}

# Per-user activity counts for one set of tweets: number of tweets, of
# retweets (non-missing `retweeted_status_id`) and of replies (non-missing
# `in_reply_to_status_id`), with users flagged as far-right when their id is
# in `farright_user_ids`. `period_label` is stored in the `when` column.
count_user_activity <- function(tweets, period_label) {
  tweets %>%
    dplyr::mutate(far_right = user_id %in% farright_user_ids) %>%
    dplyr::group_by(far_right, user_id) %>%
    dplyr::summarize(n_tweets = n(),
                     n_retweets = sum(!is.na(retweeted_status_id)),
                     n_replies = sum(!is.na(in_reply_to_status_id))) %>%
    dplyr::mutate(when = period_label)
}

far_right_behaviour.df <-
  bind_rows(count_user_activity(bushfire_tweets.dt, "Bushfires"),
            count_user_activity(covid_tweets.dt, "Covid-19"))

# Box plots of the three counts per user, far-right against all other
# accounts, one facet per period. Outlier points are not drawn and the y axis
# uses a pseudo-log scale limited to 0-4000.
far_right_behaviour.df %>%
  tidyr::pivot_longer(n_tweets:n_replies) %>%
  dplyr::mutate(
    name = recode_factor(
      factor(name, levels = c("n_tweets", "n_retweets", "n_replies")),
      n_tweets = "n tweets",
      n_retweets = "n retweets",
      n_replies = "n replies"),
    far_right = recode_factor(factor(far_right),
                              `FALSE` = "Others",
                              `TRUE` = "Far-right")) %>%
  ggplot(aes(x = name, y = value, fill = far_right)) +
  geom_boxplot(outlier.shape = NA) +
  facet_wrap('when') +
  scale_fill_brewer(palette = "Set1") +
  scale_y_continuous(trans = "pseudo_log",
                     limits = c(0, 4000),
                     breaks = c(0, 10, 1000, 4000)) +
  labs(x = NULL, y = 'average, by user', fill = NULL) +
  theme_bw()

```

Far-right accounts also consistently retweeted each other. Their retweet network is structurally dense and justifies characterising these accounts as a social media community. Among the far-right accounts included in the retweet network mapped during the bushfires and the COVID-19 conversation, 66% (Period 1) and 57% (Period 2) retweeted another far-right account or were retweeted by another far-right account at least once, while 13% (in both periods) reciprocated a retweet from another far-right account at least once (highlighted in Figure 10). 


```{r farright_rt_g, include = T, fig.width = 9, fig.height = 8, fig.cap = "Network of retweets among far-right accounts and accounts engaging in mutual retweeting", cache = T}

# Retweet networks among far-right accounts, one per period.
#
# Edges: retweets whose author and retweeted author are both in
# `farright_user_ids`. Vertices: every far-right id that appears in the
# period's tweets either as author or as retweeted author, so accounts
# without any far-right tie are kept as isolated vertices. Each retweet gets
# weight 1 before `simplify()` collapses multi-edges and removes loops; with
# igraph's default attribute combination the weights of collapsed edges are
# summed.

bushfire_farright_retweet.g <-
  bushfire_tweets.dt %>%
  dplyr::select(user_id, retweeted_status_user_id) %>%
  dplyr::filter(!is.na(retweeted_status_user_id) &
                  (user_id %in% farright_user_ids &
                     retweeted_status_user_id %in% farright_user_ids)) %>%
  igraph::graph_from_data_frame(
    vertices = data.frame(
      id = farright_user_ids[
        farright_user_ids %in% unique(bushfire_tweets.dt$user_id) |
          farright_user_ids %in%
          unique(bushfire_tweets.dt$retweeted_status_user_id)])) %>%
  igraph::set_edge_attr(name = "weight", value = 1) %>%
  igraph::simplify()

# Flag edges that have a counterpart in the opposite direction.
E(bushfire_farright_retweet.g)$mutual <-
  igraph::which_mutual(bushfire_farright_retweet.g)

# `weights` is spelled out in full (the previous `weight =` only worked
# through partial argument matching).
# NOTE(review): Kamada-Kawai treats larger weights as longer edges, so pairs
# that retweet each other more often are drawn further apart -- confirm this
# is intended.
bushfire_farright_retweet.layout <-
  igraph::layout_with_kk(
    bushfire_farright_retweet.g,
    weights = sqrt(E(bushfire_farright_retweet.g)$weight))


covid_farright_retweet.g <-
  covid_tweets.dt %>%
  dplyr::select(user_id, retweeted_status_user_id) %>%
  dplyr::filter(!is.na(retweeted_status_user_id) &
                  (user_id %in% farright_user_ids &
                     retweeted_status_user_id %in% farright_user_ids)) %>%
  igraph::graph_from_data_frame(
    vertices = data.frame(
      id = farright_user_ids[
        farright_user_ids %in% unique(covid_tweets.dt$user_id) |
          farright_user_ids %in%
          unique(covid_tweets.dt$retweeted_status_user_id)])) %>%
  igraph::set_edge_attr(name = "weight", value = 1) %>%
  igraph::simplify()

E(covid_farright_retweet.g)$mutual <-
  igraph::which_mutual(covid_farright_retweet.g)

covid_farright_retweet.layout <-
  igraph::layout_with_kk(
    covid_farright_retweet.g,
    weights = sqrt(E(covid_farright_retweet.g)$weight))

# Degree of every vertex in the sub-network that keeps only mutual edges.
V(bushfire_farright_retweet.g)$mutual_degree <-
  igraph::degree(
    bushfire_farright_retweet.g %>%
      igraph::delete_edges(which(!E(bushfire_farright_retweet.g)$mutual)))

V(covid_farright_retweet.g)$mutual_degree <-
  igraph::degree(
    covid_farright_retweet.g %>%
      igraph::delete_edges(which(!E(covid_farright_retweet.g)$mutual)))

# One panel per period; nodes are filled by whether they have at least one
# mutual tie.
cowplot::plot_grid(
  bushfire_farright_retweet.g %>%
    ggnetwork(layout = bushfire_farright_retweet.layout, arrow.gap = 0.01) %>%
    ggplot(aes(x = x, y = y, xend = xend, yend = yend)) +
    geom_edges(arrow = arrow(length = unit(2, "pt"), type = "closed"),
               colour = "gray60", size = .1) +
    geom_nodes(size = 2.5, shape = 21, colour = 'gray90',
               aes(fill = mutual_degree > 0)) +
    scale_fill_brewer(palette = "Dark2", labels = c("no mutual tie",
                                                    "1+ mutual ties")) +
    theme_blank() +
    labs(fill = NULL),
  # Disabled alternative panel showing mutual edges only:
  # bushfire_farright_retweet.g %>%
  #   igraph::delete_edges(which(!E(bushfire_farright_retweet.g)$mutual)) %>%
  #   ggnetwork(layout = bushfire_farright_retweet.layout, arrow.gap = 0.01) %>%
  #   ggplot(aes(x = x, y = y, xend = xend, yend = yend)) +
  #   geom_edges(arrow = arrow(length = unit(2, "pt"), type = "closed"),
  #              colour = "gray60", size = .1) +
  #   geom_nodes(size = 2, shape = 21,
  #              fill = 'orange') +
  #   theme_blank(),
  covid_farright_retweet.g %>%
    ggnetwork(layout = covid_farright_retweet.layout, arrow.gap = 0.01) %>%
    ggplot(aes(x = x, y = y, xend = xend, yend = yend)) +
    geom_edges(arrow = arrow(length = unit(2, "pt"), type = "closed"),
               colour = "gray60", size = .1) +
    geom_nodes(size = 2.5, shape = 21, colour = 'gray80',
               aes(fill = mutual_degree > 0)) +
    scale_fill_brewer(palette = "Dark2", labels = c("no mutual tie",
                                                    "1+ mutual ties")) +
    theme_blank() +
    labs(fill = NULL),
  # Disabled alternative panel showing mutual edges only:
  # covid_farright_retweet.g %>%
  #   igraph::delete_edges(which(!E(covid_farright_retweet.g)$mutual)) %>%
  #   ggnetwork(layout = covid_farright_retweet.layout, arrow.gap = 0.01) %>%
  #   ggplot(aes(x = x, y = y, xend = xend, yend = yend)) +
  #   geom_edges(arrow = arrow(length = unit(2, "pt"), type = "closed"),
  #              colour = "gray60", size = .1) +
  #   geom_nodes(size = 2, shape = 21,
  #              fill = 'orange') +
  #   theme_blank(),
  ncol = 1, labels = c("Bushfires", "Covid-19"))

```

```{r include = T, results = 'asis'}

# Summary statistics of the two far-right retweet networks, as percentages:
#   * share of vertices with degree > 0, i.e. with at least one tie;
#   * share of vertices with at least one mutual (reciprocated) tie;
#   * edge density.
# The row labels say "at least one" because the computations test `> 0`.
tmp_df <-
  data.frame(
    value = c(
      sum(igraph::degree(bushfire_farright_retweet.g) > 0) /
        vcount(bushfire_farright_retweet.g) * 100,
      sum(igraph::degree(covid_farright_retweet.g) > 0) /
        vcount(covid_farright_retweet.g) * 100,
      sum(V(bushfire_farright_retweet.g)$mutual_degree > 0) /
        vcount(bushfire_farright_retweet.g) * 100,
      sum(V(covid_farright_retweet.g)$mutual_degree > 0) /
        vcount(covid_farright_retweet.g) * 100,
      edge_density(bushfire_farright_retweet.g) * 100,
      edge_density(covid_farright_retweet.g) * 100),
    row.names = c("Period 1: at least one tie",
                  "Period 2: at least one tie",
                  "Period 1: at least one mutual tie",
                  "Period 2: at least one mutual tie",
                  "Period 1: edge density",
                  "Period 2: edge density"))

# The single value column is printed under the header "%".
tmp_df %>%
  kbl(format = 'latex',
      col.names = "%",
      caption = " Statistics of the far-right retweet network.",
      digits = 4) %>%
  kableExtra::kable_styling()

```

```{r compare_farright_urls, include = TRUE, fig.width = 7, fig.height = 4, fig.cap = "Tweets containing URLs to mainstream media websites and YouTube videos.", cache = T}

# Restore the saved objects; `australia_news_sources` (with a `domain`
# column) is expected to come from this file.
load("data/australia_news_sources.RData")

# Per-user URL statistics for one period.
#
# `urls` holds one row per URL found in a tweet. It is first collapsed to one
# row per tweet -- does the tweet link to any mainstream domain, to any
# "youtu.be" URL -- so that the join below cannot duplicate tweets that carry
# several URLs and thereby inflate `n_tweets`. Tweets without URLs get NA
# flags from the join, which is what `n_URLs` relies on: it counts the tweets
# with at least one URL.
# NOTE(review): only the "youtu.be" domain is counted as YouTube; confirm
# that "youtube.com" links are normalised to it upstream.
summarise_url_sharing <- function(tweets, urls, period_label) {
  urls_by_tweet <-
    urls %>%
    dplyr::group_by(tweet_id) %>%
    dplyr::summarize(
      mainstream = any(domain %in% australia_news_sources$domain),
      youtube = any(domain == "youtu.be", na.rm = TRUE))

  tweets %>%
    dplyr::mutate(far_right = user_id %in% farright_user_ids) %>%
    dplyr::left_join(urls_by_tweet, by = "tweet_id") %>%
    dplyr::group_by(far_right, user_id) %>%
    dplyr::summarize(n_tweets = n(),
                     n_URLs = sum(!is.na(mainstream)),
                     n_mainstream = sum(mainstream, na.rm = TRUE),
                     n_youtube = sum(youtube, na.rm = TRUE)) %>%
    dplyr::mutate(when = period_label)
}

far_right_urls.df <-
  bind_rows(
    summarise_url_sharing(bushfire_tweets.dt, bushfire_urls.dt, "Bushfire"),
    summarise_url_sharing(covid_tweets.dt, covid_urls.dt, "Covid"))

# Total number of tweets per period, used in the facet titles.
far_right_urls.n_tweets <-
  far_right_urls.df %>%
  dplyr::group_by(when) %>%
  dplyr::summarise(n_tweets = sum(n_tweets))

# Tweet total of one period formatted with thousands separators.
format_period_n_tweets <- function(period) {
  format(
    far_right_urls.n_tweets$n_tweets[far_right_urls.n_tweets$when == period],
    big.mark = ",", scientific = FALSE)
}

# Share of tweets linking to mainstream media and to YouTube, far-right
# against all other accounts, one facet per period.
far_right_urls.df %>%
  dplyr::group_by(far_right, when) %>%
  dplyr::summarise(perc_mainstream = sum(n_mainstream) / sum(n_tweets),
                   perc_youtube = sum(n_youtube) / sum(n_tweets)) %>%
  tidyr::pivot_longer(perc_mainstream:perc_youtube) %>%
  dplyr::mutate(
    name = recode_factor(
      factor(name, levels = c("perc_mainstream", "perc_youtube")),
      perc_mainstream = "perc. mainstream media",
      perc_youtube = "perc. YouTube"),
    far_right = recode_factor(factor(far_right),
                              `FALSE` = "Others",
                              `TRUE` = "Far-right"),
    when = recode_factor(
      when,
      Bushfire = sprintf("Bushfires (n tweets = %s)",
                         format_period_n_tweets("Bushfire")),
      Covid = sprintf("Covid-19 (n tweets = %s)",
                      format_period_n_tweets("Covid")))) %>%
  ggplot(aes(x = name, y = value, fill = far_right)) +
  geom_bar(stat = 'identity', position = 'dodge') +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_brewer(palette = "Set1") +
  labs(x = NULL, fill = NULL, y = 'as % of total number of tweets') +
  facet_wrap('when') +
  theme_bw()

```

\clearpage

# Opinion frequency

We detected the presence of a set of opinions in tweets and user descriptions using a mixed approach that combines qualitative observations and advanced machine learning [redacted]. We first identified a set of opinions (see supplemental material) through a qualitative analysis approach. Second, we trained deep neural network-based classifiers to automatically predict the presence of these opinions in the Twitter dataset. Finally, we iteratively refined the classifiers using a human-in-the-loop approach: the machine automatically selected the texts it was most and least sure of and labelled them, after which these texts were manually validated and used in a new iteration of automatic training of the classifier. Notably, the classifier did not determine the polarity of a posting (that is, whether the author agrees with the opinion or not), only whether the opinion was discussed.

Through automatic text classification, we were able to assess how often opinions were discussed or mentioned in tweets from the far-right community and compare it with the frequency of these opinions in tweets published by other accounts. Overall, we notice more diversity in the content published during Covid-19. Through a combination of qualitative coding and quantitative topic modelling, we identified 43 opinions in tweets from Period 1 and 53 from Period 2. The classifier was able to associate 13% of tweets with at least one opinion in Period 1 and only 5% in Period 2.


```{r include = T, results = "asis"}

# Frequency of each predicted opinion label, most frequent first, typeset as
# a striped long table whose first column is 30em wide.
predicted_opinions_twt %>%
  dplyr::group_by(opinion_label) %>%
  dplyr::count() %>%
  dplyr::arrange(dplyr::desc(n)) %>%
  kableExtra::kbl(
    format = "latex",
    caption = "Predicted opinions and their frequency",
    booktabs = TRUE,
    longtable = TRUE) %>%
  kableExtra::kable_styling(font_size = 10,
                            latex_options = c("striped", "repeat_header")) %>%
  kableExtra::column_spec(1, width = "30em")
 
```


```{r include = T, results = "asis"}

# Coverage of the opinion classifier in each period:
#   * percentage of original tweets (no `retweeted_status_id`) whose id
#     appears among the posts with a predicted opinion;
#   * number of distinct opinion labels predicted for the period's tweets.
# The shares are multiplied by 100 so that the rows labelled "%" hold
# percentages, as in the other tables of this document.
tmp_df <-
  data.frame(
    value = c(
      # proportion noted by the author: 0.1275142
      sum(bushfire_tweets.dt$tweet_id[
        is.na(bushfire_tweets.dt$retweeted_status_id)] %in%
          unique(predicted_opinions_twt$post_url)) /
        sum(is.na(bushfire_tweets.dt$retweeted_status_id)) * 100,
      # count noted by the author: 43
      length(unique(predicted_opinions_twt$opinion_label[
        predicted_opinions_twt$post_url %in% bushfire_tweets.dt$tweet_id])),
      # proportion noted by the author: 0.04793013
      sum(covid_tweets.dt$tweet_id[
        is.na(covid_tweets.dt$retweeted_status_id)] %in%
          unique(predicted_opinions_twt$post_url)) /
        sum(is.na(covid_tweets.dt$retweeted_status_id)) * 100,
      # count noted by the author: 53
      length(unique(predicted_opinions_twt$opinion_label[
        predicted_opinions_twt$post_url %in% covid_tweets.dt$tweet_id]))),
    row.names = c("Period 1: % tweets with one or more opinions",
                  "Period 1: n opinions in tweets",
                  "Period 2: % tweets with one or more opinions",
                  "Period 2: n opinions in tweets"))

tmp_df %>%
  kbl(format = 'latex',
      booktabs = TRUE,
      col.names = "%/n",
      caption = "Opinions automatically identified in the data",
      digits = 4) %>%
  kableExtra::kable_styling()

```

\FloatBarrier

```{r bushfire_opinions_far_right_comparison, fig.width = 12, fig.height = 6, fig.cap = "Period 1: Automatically identified opinions"}

# One row per post of the bushfire dataset and one logical column per
# predicted opinion (TRUE when the opinion was predicted for the post).
bushfire_opinions_wide <-
  predicted_opinions_twt %>%
  dplyr::filter(dataset == 'bushfire') %>%
  tidyr::pivot_wider(id_cols = post_url,
                     names_from = 'opinion_label',
                     values_from = 'opinion_label') %>%
  dplyr::mutate_at(dplyr::vars(-post_url), ~ !is.na(.x))

# Opinion columns are selected by name everywhere below, so the code does not
# depend on the number of opinions found (previously hard-coded positions).
bushfire_opinion_cols <- setdiff(names(bushfire_opinions_wide), "post_url")

# Attach the opinion flags to every tweet: original tweets are matched on
# their own id, retweets on the id of the retweeted status. Tweets without a
# match get FALSE for every opinion.
bushfire_tweets_wt_opinions.dt <-
  bind_rows(
    bushfire_tweets.dt %>%
      dplyr::filter(is.na(retweeted_status_id)) %>%
      dplyr::select(user_id, tweet_id) %>%
      dplyr::left_join(bushfire_opinions_wide,
                       by = c("tweet_id" = 'post_url')),
    bushfire_tweets.dt %>%
      dplyr::filter(!is.na(retweeted_status_id)) %>%
      dplyr::select(user_id, retweeted_status_id) %>%
      dplyr::left_join(bushfire_opinions_wide,
                       by = c("retweeted_status_id" = 'post_url'))) %>%
  dplyr::mutate_at(bushfire_opinion_cols, ~ if_else(is.na(.x), FALSE, .x))

bushfire_tweets_wt_opinions.dt$far_right <-
  bushfire_tweets_wt_opinions.dt$user_id %in% farright_user_ids

# Share of tweets mentioning each opinion, separately for far-right and other
# accounts, with a within-group rank (1 = most frequent).
bushfire_tweets_wt_opinions.perc <-
  bushfire_tweets_wt_opinions.dt %>%
  dplyr::group_by(far_right) %>%
  dplyr::summarize_at(bushfire_opinion_cols, ~ sum(.x) / n()) %>%
  tidyr::pivot_longer(cols = -far_right) %>%
  dplyr::ungroup() %>%
  dplyr::group_by(far_right) %>%
  dplyr::mutate(rank = order(order(value, decreasing = TRUE)))

# Order the opinions on the axis by their rank among far-right accounts.
these_levels.df <-
  bushfire_tweets_wt_opinions.perc %>%
  dplyr::filter(far_right) %>%
  dplyr::arrange(rank)

bushfire_tweets_wt_opinions.perc <-
  bushfire_tweets_wt_opinions.perc %>%
  dplyr::mutate(name = factor(name, levels = rev(these_levels.df$name)))

# Keep every opinion ranked in the top 20 of either group, so more than 20
# opinions can be plotted.
bushfire_tweets_wt_opinions.perc %>%
  dplyr::filter(
    name %in% bushfire_tweets_wt_opinions.perc$name[
      bushfire_tweets_wt_opinions.perc$rank < 21]) %>%
  ggplot(aes(x = name, y = value, fill = far_right)) +
  geom_bar(stat = 'identity', position = 'dodge') +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  scale_fill_brewer(palette = "Set1", labels = c("other", "far-right")) +
  theme_bw() +
  labs(fill = NULL, y = "of total number of tweets published (including retweets)", x = NULL)

```

\clearpage

Figure 8 illustrates the proportion of tweets including one of the 20 most frequent opinions expressed during the Covid-19 conversation. Over 2% of the tweets mentioned the opinion that mainstream media are not to be trusted. Among the most frequent opinions we identified a few that could potentially relate to the diffusion of misleading and dangerous information disputing or minimising the seriousness of the Covid-19 disease in violation of Twitter policies, such as the third most frequent opinion (found in 1.6% of far-right tweets), about Covid-19 being a plan set up by the elites, or the eighth most frequent, about Covid-19 not being a real or serious illness. It is also interesting to note how the most popular far-right opinions on Twitter are about the delegitimisation of epistemic institutions (elites, universities) and those communicating epistemic outcomes (the mainstream media) or about framing information (Covid-19, 5G) through alternative epistemic outcomes mostly in the form of conspiracy theories. 

```{r covid_opinions_far_right_comparison, fig.width = 12, fig.height = 6, fig.cap = "Period 2: Automatically identified opinions", cache = T}

# One row per post of the covid dataset and one logical column per predicted
# opinion (TRUE when the opinion was predicted for the post).
covid_opinions_wide <-
  predicted_opinions_twt %>%
  dplyr::filter(dataset == 'covid') %>%
  tidyr::pivot_wider(id_cols = post_url,
                     names_from = 'opinion_label',
                     values_from = 'opinion_label') %>%
  dplyr::mutate_at(dplyr::vars(-post_url), ~ !is.na(.x))

# Opinion columns are selected by name everywhere below, so the code does not
# depend on the number of opinions found (previously hard-coded positions).
covid_opinion_cols <- setdiff(names(covid_opinions_wide), "post_url")

# Attach the opinion flags to every tweet: original tweets are matched on
# their own id, retweets on the id of the retweeted status. Tweets without a
# match get FALSE for every opinion.
covid_tweets_wt_opinions.dt <-
  bind_rows(
    covid_tweets.dt %>%
      dplyr::filter(is.na(retweeted_status_id)) %>%
      dplyr::select(user_id, tweet_id) %>%
      dplyr::left_join(covid_opinions_wide,
                       by = c("tweet_id" = 'post_url')),
    covid_tweets.dt %>%
      dplyr::filter(!is.na(retweeted_status_id)) %>%
      dplyr::select(user_id, retweeted_status_id) %>%
      dplyr::left_join(covid_opinions_wide,
                       by = c("retweeted_status_id" = 'post_url'))) %>%
  dplyr::mutate_at(covid_opinion_cols, ~ if_else(is.na(.x), FALSE, .x))

covid_tweets_wt_opinions.dt$far_right <-
  covid_tweets_wt_opinions.dt$user_id %in% farright_user_ids

# Share of tweets mentioning each opinion, separately for far-right and other
# accounts, with a within-group rank (1 = most frequent).
covid_tweets_wt_opinions.perc <-
  covid_tweets_wt_opinions.dt %>%
  dplyr::group_by(far_right) %>%
  dplyr::summarize_at(covid_opinion_cols, ~ sum(.x) / n()) %>%
  tidyr::pivot_longer(cols = -far_right) %>%
  dplyr::ungroup() %>%
  dplyr::group_by(far_right) %>%
  dplyr::mutate(rank = order(order(value, decreasing = TRUE)))

# Order the opinions on the axis by their rank among far-right accounts.
these_levels.df <-
  covid_tweets_wt_opinions.perc %>%
  dplyr::filter(far_right) %>%
  dplyr::arrange(rank)

covid_tweets_wt_opinions.perc <-
  covid_tweets_wt_opinions.perc %>%
  dplyr::mutate(name = factor(name, levels = rev(these_levels.df$name)))

# Keep every opinion ranked in the top 20 of either group, so more than 20
# opinions can be plotted.
# NOTE(review): the tweet counts in the legend labels are hard-coded; confirm
# they still match the data.
covid_tweets_wt_opinions.perc %>%
  dplyr::filter(
    name %in% covid_tweets_wt_opinions.perc$name[
      covid_tweets_wt_opinions.perc$rank < 21]) %>%
  ggplot(aes(x = name, y = value, fill = far_right)) +
  geom_bar(stat = 'identity', position = 'dodge') +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  scale_fill_brewer(palette = "Set1", labels = c("other (n=4,997,666)", "far-right (n=40,642)")) +
  theme_bw() +
  labs(fill = NULL, y = "of total number of tweets published (including retweets)", x = NULL)

```


```{r}

# Tweet text with @mentions (4-15 word characters after the "@") and all
# whitespace removed; its length is used as the tweet's character count.
strip_mentions_and_space <- function(text) {
  gsub("\\s+", "", gsub("@[A-Za-z0-9_]{4,15}", "", text))
}

# Tweets of one period for which no opinion was predicted, in the same shape
# as the rows built from `predicted_opinions_twt` (n_topics is 0).
tweets_without_opinion <- function(tweets, period_label) {
  tweets %>%
    dplyr::filter(!tweet_id %in% predicted_opinions_twt$post_url) %>%
    dplyr::mutate(post_url = tweet_id,
                  n_topics = 0,
                  period = period_label,
                  n_char = nchar(strip_mentions_and_space(text))) %>%
    dplyr::select(post_url, n_topics, period, n_char)
}

# One row per post: number of distinct predicted opinions and character
# count. Posts with a prediction are assigned to a period by date; the
# comparison with 2020-02-01 is inclusive on the Covid-19 side so that a post
# dated exactly on the boundary is not left with an NA period.
# NOTE(review): assumes `time` is comparable with a Date -- confirm its class.
opinion_distribution.df <-
  predicted_opinions_twt %>%
  dplyr::group_by(
    period = case_when(time < as.Date("2020-02-01") ~ "bushfires",
                       time >= as.Date("2020-02-01") ~ "Covid-19"),
    post_url) %>%
  dplyr::summarize(
    n_topics = length(unique(opinion_label)),
    n_char = nchar(unique(strip_mentions_and_space(text)))) %>%
  dplyr::bind_rows(tweets_without_opinion(bushfire_tweets.dt, "bushfires")) %>%
  dplyr::bind_rows(tweets_without_opinion(covid_tweets.dt, "Covid-19"))

# Flag the posts that are retweets (non-missing `retweeted_status_id`) in
# either period.
opinion_distribution.df$rt <-
  opinion_distribution.df$post_url %in%
  c(bushfire_tweets.dt$tweet_id[!is.na(bushfire_tweets.dt$retweeted_status_id)],
    covid_tweets.dt$tweet_id[!is.na(covid_tweets.dt$retweeted_status_id)])

```


```{r include = T, results = 'asis'}

# Per period, among posts whose cleaned text is longer than 100 characters:
# number and percentage (two decimals) of posts with at least one predicted
# opinion.
opinion_distribution.df %>%
  dplyr::filter(n_char > 100) %>%
  dplyr::group_by(period) %>%
  dplyr::summarize(
    n = sum(n_topics > 0),
    `%` = round(sum(n_topics > 0) / dplyr::n() * 100, 2)) %>%
  kableExtra::kbl(
    caption = "Tweets associated to at least one opinion",
    booktabs = TRUE)

```

```{r include = T, results = 'asis'}

# Per period, among posts not flagged as retweets: number and percentage (two
# decimals) of posts with at least one predicted opinion.
opinion_distribution.df %>%
  dplyr::filter(!(rt)) %>%
  dplyr::group_by(period) %>%
  dplyr::summarize(
    n = sum(n_topics > 0),
    `%` = round(sum(n_topics > 0) / dplyr::n() * 100, 2)) %>%
  kableExtra::kbl(
    caption = "Tweets associated to at least one opinion (excluding RTs)",
    booktabs = TRUE)

```



